from collections import Counter

import tensorflow as tf
import tensorflow_datasets as tfds
def preprocess_datasets(ds_raw_train, ds_raw_valid, ds_raw_test,
                        max_seq_length=None, batch_size=32):
    # Step 1: build the vocabulary from the training texts
    tokenizer = tfds.deprecated.text.Tokenizer()
    token_counts = Counter()
    for example in ds_raw_train:
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        if max_seq_length is not None:
            tokens = tokens[-max_seq_length:]
        token_counts.update(tokens)
    print('Vocab-size:', len(token_counts))

    # Step 2: encode each text as a sequence of token indices
    encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

    def encode(text_tensor, label):
        text = text_tensor.numpy()[0]
        encoded_text = encoder.encode(text)
        if max_seq_length is not None:
            encoded_text = encoded_text[-max_seq_length:]
        return encoded_text, label

    # Wrap the eager `encode` function so Dataset.map can call it
    def encode_map_fn(text, label):
        return tf.py_function(encode, inp=[text, label],
                              Tout=(tf.int64, tf.int64))

    ds_train = ds_raw_train.map(encode_map_fn)
    ds_valid = ds_raw_valid.map(encode_map_fn)
    ds_test = ds_raw_test.map(encode_map_fn)

    # Step 3: batch, padding each sequence to the longest one in its
    # batch ([-1]); the scalar labels ([]) need no padding
    train_data = ds_train.padded_batch(batch_size, padded_shapes=([-1], []))
    valid_data = ds_valid.padded_batch(batch_size, padded_shapes=([-1], []))
    test_data = ds_test.padded_batch(batch_size, padded_shapes=([-1], []))

    return (train_data, valid_data, test_data, len(token_counts))
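
# --- Usage sketch (illustrative; not part of the original listing) ---
# Builds a tiny toy (text, label) dataset in the shape the function
# expects: each text is a length-1 vector of byte strings, as produced by
# tf.data.Dataset.from_tensor_slices on a single-column array. The toy
# sentences, split sizes, and max_seq_length are assumptions for the demo.
import numpy as np

texts = np.array([[b'this movie was great'],
                  [b'terrible plot and bad acting'],
                  [b'a wonderful touching film'],
                  [b'not worth watching twice']])
labels = np.array([1, 0, 1, 0], dtype=np.int64)
ds_toy = tf.data.Dataset.from_tensor_slices((texts, labels))

train_data, valid_data, test_data, n = preprocess_datasets(
    ds_toy.take(2), ds_toy.skip(2).take(1), ds_toy.skip(3),
    max_seq_length=10, batch_size=2)
# TokenTextEncoder reserves index 0 for padding and index n+1 for
# out-of-vocabulary tokens, so the embedding needs n + 2 input rows.
vocab_size = n + 2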
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
def build_rnn_model(embedding_dim, vocab_size, recurrent_type='SimpleRNN',
                    n_recurrent_units=64, n_recurrent_layers=1,
                    bidirectional=True):
    tf.random.set_seed(1)
    model = tf.keras.Sequential()
    # Embedding layer maps token indices to dense vectors
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                        name='embed_layer'))
    # Stack the recurrent layers; every layer except the last returns the
    # full sequence so the next layer receives one input per time step
    for i in range(n_recurrent_layers):
        return_sequences = (i < n_recurrent_layers - 1)
        if recurrent_type == 'SimpleRNN':
            recurrent_layer = SimpleRNN(units=n_recurrent_units,
                                        return_sequences=return_sequences,
                                        name='simprnn-layer-{}'.format(i))
        elif recurrent_type == 'LSTM':
            recurrent_layer = LSTM(units=n_recurrent_units,
                                   return_sequences=return_sequences,
                                   name='lstm-layer-{}'.format(i))
        elif recurrent_type == 'GRU':
            recurrent_layer = GRU(units=n_recurrent_units,
                                  return_sequences=return_sequences,
                                  name='gru-layer-{}'.format(i))
        else:
            raise ValueError('Unknown recurrent_type: {}'.format(recurrent_type))
        if bidirectional:
            recurrent_layer = Bidirectional(recurrent_layer,
                                            name='bidir-' + recurrent_layer.name)
        model.add(recurrent_layer)
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    return model
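
# --- Usage sketch (illustrative; not part of the original listing) ---
# Compiles and trains the model on the datasets from the sketch above.
# The Adam learning rate, epoch count, and choice of LSTM are assumptions,
# not values taken from the original listing.
model = build_rnn_model(embedding_dim=20, vocab_size=vocab_size,
                        recurrent_type='LSTM', n_recurrent_units=64,
                        n_recurrent_layers=1, bidirectional=True)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])
history = model.fit(train_data, validation_data=valid_data, epochs=10)
test_loss, test_acc = model.evaluate(test_data)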